We will do exploratory data analysis

In [68]:
#importing all necessary libaries at the top
import pandas as pd
from sklearn import tree
import pydotplus
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt 
import matplotlib.image as pltimage
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier #importing Decision Tree Classifier
from sklearn.model_selection import train_test_split #importing train_test_split function
from sklearn import metrics #import scikit-learn metrics module for accuracy calculation
In [47]:
df = pd.read_csv('apple_quality.csv')
df.head()
Out[47]:
A_id Size Weight Sweetness Crunchiness Juiciness Ripeness Acidity Quality
0 39 1.286738 -2.429525 0.956133 1.477542 4.786376 -3.382357 2.519347 good
1 79 -2.167462 -1.729668 1.292344 2.343027 0.313248 -0.907590 -1.787447 bad
2 119 -0.591987 -2.618652 -1.073479 1.114681 -2.536375 1.754758 -1.660639 bad
3 159 0.008339 -0.258309 -0.589022 2.216559 2.347407 -0.439728 -4.797639 good
4 199 -3.976937 -1.876774 -0.103706 1.482254 -0.416407 1.765281 2.718979 bad
In [48]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   A_id         100 non-null    int64  
 1   Size         100 non-null    float64
 2   Weight       100 non-null    float64
 3   Sweetness    100 non-null    float64
 4   Crunchiness  100 non-null    float64
 5   Juiciness    100 non-null    float64
 6   Ripeness     100 non-null    float64
 7   Acidity      100 non-null    float64
 8   Quality      100 non-null    object 
dtypes: float64(7), int64(1), object(1)
memory usage: 7.2+ KB
In [49]:
df
Out[49]:
A_id Size Weight Sweetness Crunchiness Juiciness Ripeness Acidity Quality
0 39 1.286738 -2.429525 0.956133 1.477542 4.786376 -3.382357 2.519347 good
1 79 -2.167462 -1.729668 1.292344 2.343027 0.313248 -0.907590 -1.787447 bad
2 119 -0.591987 -2.618652 -1.073479 1.114681 -2.536375 1.754758 -1.660639 bad
3 159 0.008339 -0.258309 -0.589022 2.216559 2.347407 -0.439728 -4.797639 good
4 199 -3.976937 -1.876774 -0.103706 1.482254 -0.416407 1.765281 2.718979 bad
... ... ... ... ... ... ... ... ... ...
95 3839 -1.262208 -1.911969 0.601879 0.972100 2.363918 0.220316 2.311175 bad
96 3879 1.368756 0.594596 3.014961 3.311564 1.156120 -4.129005 5.458590 good
97 3919 -1.102334 -3.224132 -0.778734 0.872523 0.240080 -1.472499 0.458893 good
98 3959 0.444592 0.290374 -1.973923 0.400662 2.991892 0.367348 -0.747903 bad
99 3999 0.278540 -1.715505 0.121217 -1.154075 1.266677 -0.776571 1.599796 good

100 rows × 9 columns

In [50]:
# Get the number of rows and columns
num_rows, num_columns = df.shape

# Print the result
print(f'The dataset has {num_rows} rows and {num_columns} columns.')
The dataset has 100 rows and 9 columns.
In [51]:
#checking the null values
df.isnull().any()
Out[51]:
A_id           False
Size           False
Weight         False
Sweetness      False
Crunchiness    False
Juiciness      False
Ripeness       False
Acidity        False
Quality        False
dtype: bool
In [72]:
# Distribution of quality  
sns.countplot(x='Quality', data=df)
plt.xlabel('Quality')
plt.title('Distribution of Apple Quality Categories')
Out[72]:
Text(0.5, 1.0, 'Distribution of Apple Quality Categories')
No description has been provided for this image
In [73]:
# Boxplots for continuous variables  
df.drop('A_id', axis=1).plot(kind='box')
plt.title('Distribution of Apple Attributes')
plt.xticks(rotation=45)
Out[73]:
(array([1, 2, 3, 4, 5, 6, 7]),
 [Text(1, 0, 'Size'),
  Text(2, 0, 'Weight'),
  Text(3, 0, 'Sweetness'),
  Text(4, 0, 'Crunchiness'),
  Text(5, 0, 'Juiciness'),
  Text(6, 0, 'Ripeness'),
  Text(7, 0, 'Acidity')])
No description has been provided for this image
In [75]:
# Encode Quality column  
df['Quality'] = df['Quality'].map({'good':1, 'bad':0})

# Heatmap with encoded quality
sns.heatmap(df.corr(), annot=True)
plt.title('Correlation Heatmap')
Out[75]:
Text(0.5, 1.0, 'Correlation Heatmap')
No description has been provided for this image
In [80]:
# Sweetness vs Size  
sns.scatterplot(x='Sweetness', y='Size', hue='Quality',
                data=df)
Out[80]:
<Axes: xlabel='Sweetness', ylabel='Size'>
No description has been provided for this image
In [81]:
# Acidity vs Ripeness                    
sns.scatterplot(x='Acidity', y='Ripeness', 
                hue='Quality', data=df)
Out[81]:
<Axes: xlabel='Acidity', ylabel='Ripeness'>
No description has been provided for this image
In [82]:
# Compare bad vs good apples
fig, ax = plt.subplots(3,2, figsize=(12,12))
sns.boxplot(x="Quality", y="Size", data=df, ax=ax[0,0]) 
sns.boxplot(x="Quality", y="Weight", data=df, ax=ax[0,1])

sns.boxplot(x="Quality", y="Sweetness", data=df, ax=ax[1,0])
sns.boxplot(x="Quality", y="Crunchiness", data=df, ax=ax[1,1]) 

sns.boxplot(x="Quality", y="Ripeness", data=df, ax=ax[2,1])
Out[82]:
<Axes: xlabel='Quality', ylabel='Ripeness'>
No description has been provided for this image
In [83]:
sns.pairplot(df.drop('A_id', axis=1), hue='Quality')
Out[83]:
<seaborn.axisgrid.PairGrid at 0x1b5662e15d0>
No description has been provided for this image
In [52]:
#splitting dataset in features and target variables
feature_cols=['Size','Weight','Sweetness','Crunchiness','Juiciness','Ripeness','Acidity']
x=df[feature_cols] #all features
y=df.Quality
In [53]:
#Now splitting the dataset into training set and test set by using function train_test_splilit
#we need to pass the 3 parameters
x_test, x_train, y_test, y_train = train_test_split(x, y, test_size=0.3, random_state=1)
In [54]:
print("Training split input -",  x_train.shape)
print("Testing split input-", x_test.shape)
Training split input - (30, 7)
Testing split input- (70, 7)
In [55]:
#creating decision tree classifier object
clf= DecisionTreeClassifier()
In [56]:
#Train Decision Tree Classifier
clf = clf.fit(x_train, y_train)
In [57]:
#Print Text Representation. Exporting Decision Tree to the text representation can be useful
test_representation = tree.export_text(clf)
print(test_representation)
|--- feature_2 <= 0.20
|   |--- feature_3 <= 2.70
|   |   |--- feature_5 <= -1.41
|   |   |   |--- class: good
|   |   |--- feature_5 >  -1.41
|   |   |   |--- feature_3 <= -0.47
|   |   |   |   |--- feature_3 <= -0.86
|   |   |   |   |   |--- class: bad
|   |   |   |   |--- feature_3 >  -0.86
|   |   |   |   |   |--- class: good
|   |   |   |--- feature_3 >  -0.47
|   |   |   |   |--- class: bad
|   |--- feature_3 >  2.70
|   |   |--- class: good
|--- feature_2 >  0.20
|   |--- feature_4 <= -1.19
|   |   |--- class: bad
|   |--- feature_4 >  -1.19
|   |   |--- feature_3 <= 2.26
|   |   |   |--- class: good
|   |   |--- feature_3 >  2.26
|   |   |   |--- class: bad

In [58]:
#plotting the tree

fig = plt.figure(figsize=(250,200))
_ = tree.plot_tree(clf, feature_names = feature_cols, filled= True)
No description has been provided for this image
In [ ]:
 
In [59]:
print("Training split input -",  x_train.shape)
print("Testing split input-", x_test.shape)
print("Testing split output-", y_test.shape)
Training split input - (30, 7)
Testing split input- (70, 7)
Testing split output- (70,)
In [60]:
y_pred = clf.predict(x_test)
In [61]:
print(y_pred)
['bad' 'bad' 'bad' 'good' 'bad' 'bad' 'bad' 'bad' 'good' 'bad' 'bad' 'bad'
 'bad' 'good' 'good' 'bad' 'good' 'good' 'good' 'bad' 'good' 'bad' 'bad'
 'bad' 'good' 'bad' 'bad' 'good' 'bad' 'good' 'bad' 'good' 'bad' 'bad'
 'good' 'bad' 'bad' 'good' 'good' 'good' 'bad' 'bad' 'bad' 'bad' 'good'
 'bad' 'bad' 'good' 'good' 'bad' 'good' 'bad' 'bad' 'bad' 'bad' 'bad'
 'bad' 'bad' 'bad' 'good' 'bad' 'bad' 'bad' 'good' 'bad' 'good' 'bad'
 'bad' 'bad' 'bad']
In [62]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
Accuracy: 0.6142857142857143
In [63]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1) 
In [64]:
print("Training split input -",  x_train.shape)
print("Testing split input-", x_test.shape)
print("Testing split output-", y_test.shape)
Training split input - (80, 7)
Testing split input- (20, 7)
Testing split output- (20,)
In [65]:
y_pred = clf.predict(x_test)
In [66]:
print(y_pred)
['good' 'bad' 'good' 'good' 'good' 'good' 'good' 'bad' 'good' 'bad' 'good'
 'bad' 'bad' 'good' 'bad' 'bad' 'good' 'bad' 'bad' 'bad']
In [67]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
Accuracy: 1.0

Accuracy is increased to 1.0

In [ ]: